**06/28/2021
**file to bring in CPD FOIA complaints files received in 2021 & assign unique IDs
 
**Housekeepig
local ver_suff _9_21
clear all
 
set more off


//later pass through profiles dataset to fuzzy match 
local profiles_match `1'

*******************
*BRING IN COMPLAINTS DATA
*******************
**Read main complants data
use ../processed_data`ver_suff'/complaints_2021, clear


tempfile complaints_old
save `complaints_old'		   

**Read recent complaints data
use ../processed_data`ver_suff'/complaints_cms, clear

**Standardize finding variable across datasets
rename recommended_finding finding_cd
replace finding_cd = upper(finding_cd) 

**Add back main complaints data
append using `complaints_old'		   

**Store officer name
gen fname = trim(acc_fname)
gen lname = trim(acc_lname)
gen mi = trim(acc_mi)

*Store officer appointment year
gen appoint_yr = year(acc_appoint_dt)
gen appoint_m = month(acc_appoint_dt)
gen appoint_d = day(acc_appoint_dt)		   

**Store officer birthyear
gen birthyear = acc_byr

**Standardize form of missing middle initial
replace mi = "" if mi == "NULL"

**Clean some clearly errouneous name information
sort fname lname mi appoint_yr appoint_m appoint_d birthyear 
drop if lname == ""
drop if fname == ""
drop if fname == "Unknown" 
drop if substr(fname,1,1)<"A"
drop if substr(fname,1,1)>"Z"
drop if fname == "White man"
drop if fname == "Will county"
drop if fname == "Wouldn't identify themselves"
drop if fname == "Test"
drop if fname == "Tall"
drop if fname == "Toney with a Y"
drop if fname == "U FIGURE"
drop if fname == "UNKNOWN"
drop if fname == "UNK"
drop if fname == "UN"
drop if fname == "The Entire Chicago Police"
drop if strpos(fname, "Unknown")
drop if strpos(fname, "Unknwn")
drop if fname == "Any"
drop if fname == "B"
drop if fname == "Chicago"
drop if fname == "Chicago land alldirection Northsouth east west."
drop if fname == "Chief"
drop if fname == "City of Chicago"
drop if fname == "Cop"
drop if fname == "Cumsucking pig"
drop if fname == "Kenneth Capp _ James Entree 9th District Police Corruption"
drop if strpos(fname, "No police")
drop if strpos(fname, "None")
drop if strpos(fname, "Applicable")
drop if strpos(fname, "Not")
drop if strpos(fname, "Noone")
drop if strpos(fname, "Online")
drop if strpos(fname, "Refused")
drop if strpos(fname, "River Place")
drop if strpos(fname, "Dumb")
drop if strpos(fname, "I do")
drop if strpos(lname, "Kingdom")
drop if strpos(fname, "Many unknown")
drop if strpos(fname, "N/")
drop if strpos(fname, "NA")
drop if strpos(fname, "Nine")
drop if strpos(fname, "No Department")
drop if strpos(fname, "Name")
drop if fname == "Na" 
drop if fname == "N" 
replace fname = subinstr(fname, "Officer", "", .)
replace fname = subinstr(fname, "Sergeant", "", .)
replace fname = subinstr(fname, "Sgt", "", .)
replace fname = subinstr(fname, "Mr.", "", .)
replace fname = "" if fname == "Mr"
drop if strpos(fname, "SHOULD NOT")
drop if strpos(fname, "Some")
drop if strpos(fname, "Chicago")
drop if lname == "Doe female shooter jumpertyee misconduct"
drop if lname == "?"
drop if lname == "Iiiiiii"
drop if strpos(lname, "Doe ")
drop if strpos(lname, "Does")
drop if strpos(lname, "N/a")
drop if strpos(fname, "John doe")
drop if strpos(lname, "a Few")
drop if strpos(lname, "Alpesh and Carlton Nelson and Debra and nancy")
drop if strpos(lname, "Daley  Standing Office at Mayor")
drop if strpos(fname, "St Charles Parish")
drop if strpos(lname, "Annas baby daddy the green house on with the flags on lincoln and sacremento")
drop if strpos(lname, "Population")
drop if strpos(fname, "Unknow")
drop if strpos(fname, "Unkown")
drop if strpos(fname, "Upstairs")
drop if strpos(fname, "See Video")
drop if strpos(fname, "Morgan park at")

//some obs have variations of last names: deflate last names
gen lname_mod = subinstr(lname, "-", "", .)
replace lname_mod = subinstr(lname_mod, " ", "", .)
replace lname_mod = subinstr(lname_mod, ".", "", .)

 
**Save intermediate version of complaints
save ../processed_data`ver_suff'/FOIA_processing/complaints, replace
  


*******************
*DEDUPLICATE COMPLAINTS DATA
******************* 

**Deduplicate with respect to matching variables
duplicates drop fname lname_mod mi appoint_yr appoint_m appoint_d birthyear , force
keep fname lname lname_mod mi appoint_yr appoint_m appoint_d birthyear acc_race acc_male

 **Generate preliminary ID
gen pid = _n 

**Create name soundexes
gen fname_sdx = soundex(fname)
gen lname_sdx = soundex(lname)

**Standardize race and sex information
rename acc_race race
gen sex = "M" if acc_male ==1
replace sex = "F" if acc_male == 0

//if missing birthyear or appointed dates give data from obs that share the same names
sort fname lname_mod appoint_yr appoint_m appoint_d birthyear
replace pid = pid[_n-1] if fname == fname[_n-1] & lname_mod == lname_mod[_n-1] & appoint_yr == appoint_yr[_n-1]& appoint_m == appoint_m[_n-1]& appoint_d == appoint_d[_n-1] & birthyear == .
replace pid = pid[_n-1] if fname == fname[_n-1] & lname_mod == lname_mod[_n-1] & birthyear == birthyear[_n-1] & appoint_yr == .
replace pid = pid[_n-1] if fname == fname[_n-1] & lname_mod == lname_mod[_n-1] & birthyear == . & appoint_yr == .

**Save dataset with preliminary IDs
save ../processed_data`ver_suff'/FOIA_processing/complaints_pid, replace
clear



*******************
*SET UP LIST OF DATA TO STORE MATCHED IDS
*******************
clear 

set obs 1
gen u_pid = -1
gen pid = -1

save ../processed_data`ver_suff'/FOIA_processing/pid_appending, replace
clear


*******************
*FUZZY MATCHING
*******************

use ../processed_data`ver_suff'/FOIA_processing/complaints_pid, clear // start with 18,544



**Perfect name, appoint date, birth year match
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 2 1 2 2 2 2 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 1 //27,746 so far



**Perfect name, appoint date match, 1 year birth year gap
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 2 1 2 2 2 1 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 2  



**Perfect name, appoint date, and birth year match, no middle initial
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 2 0 2 2 2 2 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 3  


**Perfect first name, soundex last name, appoint date, and birth year match, no middle initial
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 1 0 2 2 2 2 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 4  


**Soundex both names, middle initial, and appoint date, birth year match
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 1 1 1 2 2 2 2 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 5  


**Soundex both names, no middle initial, and appoint date, birth year match
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 1 1 0 2 2 2 2 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 6  


**first name, no middle initial, no last name and appoint date, birth year match
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 0 0 2 2 2 2 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 7  

 

**first name, last name, off in appoint_yr 
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 2 0 1 2 2 2 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 9  


**first name, last name, off in appoint_mo 
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 2 0 2 1 2 2 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 10  


**first name, last name, off in appoint_day 
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 2 0 2 2 1 2 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 11  


**first name, last name, off in birth year 
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 2 0 2 2 2 1 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 12  


**first name, last name, no other match 
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 2 2 0  3 3 3 3 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 13  

**soundex first name, soundex last name, no other match 
do ../do_files`ver_suff'/fuzzy_merge_pid_Kim 1 1 0  3 3 3 3 `profiles_match' u_pid ../processed_data`ver_suff'/FOIA_processing/pid_appending 14  
  
 
 
 
*******************
*KEEP BEST MATCHES
******************* 

use ../processed_data`ver_suff'/FOIA_processing/pid_appending.dta, clear

//drop the less good matches if individuals in awards data are matched with more than one in profiles

sort pid match_no u_pid
quietly by pid:  gen dup = cond(_N==1,0,_n)
sort dup
tab dup

drop if dup>1 
drop dup 
sort pid match_no

tempfile best_matches
save `best_matches'

*******************
*MERGE DE-DUPLICATED COMPLAINTS DATA WITH MATCHINGS
******************* 

use ../processed_data`ver_suff'/FOIA_processing/complaints_pid, clear

merge m:1 pid using `best_matches', keep(1 3)  





